/*
MIT License

Copyright (c) 2021 МГТУ им. Н.Э. Баумана, кафедра ИУ-6, Михаил Фетисов,

https://bmstu.codes/lsx/simodo
*/

#include "simodo/inout/token/InputStream.h"

#include <cassert>
#include <cstdint>

namespace
{
    /// \brief Byte order mark stored as a fixed-size sequence of raw bytes.
    /// \tparam size number of bytes in the mark
    template <int size>
    struct UTF_BOM
    {
        const unsigned char bytes[size];
    };

    static constexpr UTF_BOM<3> UTF8_BOM = {
        {0xEF, 0xBB, 0xBF}
    };
    static constexpr UTF_BOM<2> UTF16BE_BOM = {
        {0xFE, 0xFF}
    };
    static constexpr UTF_BOM<2> UTF16LE_BOM = {
        {0xFF, 0xFE}
    };
    static constexpr UTF_BOM<4> UTF32BE_BOM = {
        {0x00, 0x00, 0xFE, 0xFF}
    };
    // Note: starts with the same two bytes as UTF16LE_BOM, so callers that
    // probe several marks must try this one before the UTF-16LE mark.
    static constexpr UTF_BOM<4> UTF32LE_BOM = {
        {0xFF, 0xFE, 0x00, 0x00}
    };

    /// \brief Consumes the given byte order mark if the stream starts with it.
    ///
    /// Bytes are consumed one at a time while they match the mark. On a partial
    /// match every consumed byte is put back with unget(), so the read position
    /// is left where it was. Putting back more than one byte relies on the
    /// stream buffer supporting it.
    ///
    /// \param is  stream positioned where the mark is expected
    /// \param bom mark to look for
    /// \return true if the whole mark was found and skipped, false otherwise
    template <int bom_size>
    static bool skipUtfBom(
        std::istream & is
        , const UTF_BOM<bom_size> & bom)
    {
        int matched = 0;

        // The bounds check must come first: otherwise bom.bytes[] would be
        // indexed one past its end after a complete match.
        while (matched < bom_size
               && is.peek() == static_cast<int>(bom.bytes[matched]))
        {
            is.get();
            ++matched;
        }

        if (matched == bom_size)
            return true;

        // Partial match: roll back everything that was consumed.
        for (; matched > 0; --matched)
            is.unget();

        return false;
    }
}

namespace simodo::inout
{

/// \brief Decodes the next UTF-16 code unit from the underlying UTF-8 byte stream.
///
/// On the first call a leading byte order mark, if present, is skipped.
/// Code points above U+FFFF are delivered as a surrogate pair over two
/// consecutive calls: the high (lead) surrogate first, then the low (trail)
/// surrogate. Malformed input (invalid lead byte, missing or invalid
/// continuation byte, encoded surrogate, value above U+10FFFF) produces
/// U+FFFD. Overlong encodings are not rejected.
///
/// \return the next code unit, or std::char_traits<char16_t>::eof() when the
///         stream is exhausted
char16_t InputStream::get()
{
    constexpr char16_t REPLACEMENT_CHARACTER = 0xFFFD;
    constexpr int      BYTE_EOF              = std::char_traits<char>::eof();

    // Second half of a surrogate pair left over from the previous call.
    if (_surrogate_pair != 0)
    {
        const char16_t pending = _surrogate_pair;
        _surrogate_pair = 0;
        return pending;
    }

    if (_in.eof())
        return std::char_traits<char16_t>::eof();

    if (_untouched)
    {
        // The UTF-32LE mark begins with the UTF-16LE mark, so the 4-byte marks
        // are probed before the 2-byte ones. The result is ignored on purpose:
        // a missing mark is not an error.
        static_cast<void>(
            skipUtfBom(_in, UTF8_BOM)
            || skipUtfBom(_in, UTF32BE_BOM)
            || skipUtfBom(_in, UTF32LE_BOM)
            || skipUtfBom(_in, UTF16BE_BOM)
            || skipUtfBom(_in, UTF16LE_BOM));

        _untouched = false;
    }

    const int lead = _in.get();

    if (lead == BYTE_EOF)
        return std::char_traits<char16_t>::eof();

    if (lead <= 0x7F)
        return static_cast<char16_t>(lead);

    uint32_t code;
    int      count;

    if ((lead & 0b11100000) == 0b11000000)
    {
        count = 2;
        code = (lead & 0b00011111);
    }
    else if ((lead & 0b11110000) == 0b11100000)
    {
        count = 3;
        code = (lead & 0b00001111);
    }
    else if ((lead & 0b11111000) == 0b11110000)
    {
        count = 4;
        code = (lead & 0b00000111);
    }
    else
    {
        // Stray continuation byte (10xxxxxx) or a byte that can never start
        // a UTF-8 sequence (11111xxx): consume just this byte.
        return REPLACEMENT_CHARACTER;
    }

    for (int i = 1; i < count; ++i)
    {
        // Peek first so that a byte which is not a continuation stays in the
        // stream and is decoded as the start of the next sequence.
        const int next = _in.peek();

        if (next == BYTE_EOF || (next & 0b11000000) != 0b10000000)
            return REPLACEMENT_CHARACTER;

        _in.get();
        code = (code << 6) | (static_cast<uint32_t>(next) & 0b00111111);
    }

    // Surrogate code points must not be encoded in UTF-8, and nothing above
    // U+10FFFF can be represented in UTF-16.
    if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF)
        return REPLACEMENT_CHARACTER;

    if (code <= 0xFFFF)
        return static_cast<char16_t>(code);

    // Supplementary plane: emit the high surrogate now and keep the low one
    // for the next call.
    code -= 0x010000;
    _surrogate_pair = static_cast<char16_t>(0xDC00 + (code & 0x3FF));
    return static_cast<char16_t>(0xD800 + ((code >> 10) & 0x3FF));
}

}